%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from multiprocessing import Pool
import matplotlib
import scipy.stats as ss
import armine
import networkx as nx
import hashlib
from akapriori import apriori
#import mpld3
#ignore warnings
# NOTE(review): this silences ALL warnings globally, including pandas
# SettingWithCopyWarning — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')
# read all tables
# Instacart market-basket CSVs: aisles/departments/products are lookup tables;
# orders plus the two order_products files hold the transaction rows.
aisles = pd.read_csv("aisles.csv")
departments = pd.read_csv("departments.csv")
orders = pd.read_csv("orders.csv")
products = pd.read_csv("products.csv")
order_products_prior= pd.read_csv("order_products__prior.csv")
order_products_train= pd.read_csv("order_products__train.csv")
#Checking Data Frame - Aisle
# 134 rows: (aisle_id, aisle name)
print(aisles.shape)
aisles.head()
(134, 2)
| aisle_id | aisle | |
|---|---|---|
| 0 | 1 | prepared soups salads |
| 1 | 2 | specialty cheeses |
| 2 | 3 | energy granola bars |
| 3 | 4 | instant foods |
| 4 | 5 | marinades meat preparation |
#Checking Data Frame - Departments
# 21 rows: (department_id, department name)
print(departments.shape)
departments.head()
(21, 2)
| department_id | department | |
|---|---|---|
| 0 | 1 | frozen |
| 1 | 2 | other |
| 2 | 3 | bakery |
| 3 | 4 | produce |
| 4 | 5 | alcohol |
#Checking Data Frame - Orders
# ~3.4M rows: one row per order, with user, eval_set (prior/train/test),
# sequence number, day-of-week, hour, and days since the previous order.
print(orders.shape)
orders.head()
(3421083, 7)
| order_id | user_id | eval_set | order_number | order_dow | order_hour_of_day | days_since_prior_order | |
|---|---|---|---|---|---|---|---|
| 0 | 2539329 | 1 | prior | 1 | 2 | 8 | NaN |
| 1 | 2398795 | 1 | prior | 2 | 3 | 7 | 15.0 |
| 2 | 473747 | 1 | prior | 3 | 3 | 12 | 21.0 |
| 3 | 2254736 | 1 | prior | 4 | 4 | 7 | 29.0 |
| 4 | 431534 | 1 | prior | 5 | 4 | 15 | 28.0 |
#Checking Data Frame - Product
# ~49.7K rows: product name plus its aisle_id / department_id foreign keys
print(products.shape)
products.head()
(49688, 4)
| product_id | product_name | aisle_id | department_id | |
|---|---|---|---|---|
| 0 | 1 | Chocolate Sandwich Cookies | 61 | 19 |
| 1 | 2 | All-Seasons Salt | 104 | 13 |
| 2 | 3 | Robust Golden Unsweetened Oolong Tea | 94 | 7 |
| 3 | 4 | Smart Ones Classic Favorites Mini Rigatoni Wit... | 38 | 1 |
| 4 | 5 | Green Chile Anytime Sauce | 5 | 13 |
#Checking Data Frame - order_products_prior
# ~32.4M rows: line items of prior orders (order_id, product_id,
# position in cart, reordered flag)
print(order_products_prior.shape)
order_products_prior.head()
(32434489, 4)
| order_id | product_id | add_to_cart_order | reordered | |
|---|---|---|---|---|
| 0 | 2 | 33120 | 1 | 1 |
| 1 | 2 | 28985 | 2 | 1 |
| 2 | 2 | 9327 | 3 | 0 |
| 3 | 2 | 45918 | 4 | 1 |
| 4 | 2 | 30035 | 5 | 0 |
#Checking Data Frame - order_products_train
# ~1.4M rows: line items of the train-set orders, same schema as prior
print(order_products_train.shape)
order_products_train.head()
(1384617, 4)
| order_id | product_id | add_to_cart_order | reordered | |
|---|---|---|---|---|
| 0 | 1 | 49302 | 1 | 1 |
| 1 | 1 | 11109 | 2 | 1 |
| 2 | 1 | 10246 | 3 | 0 |
| 3 | 1 | 49683 | 4 | 0 |
| 4 | 1 | 43633 | 5 | 1 |
# Join all tables into one denormalized market-basket frame:
# order header + line items + product / aisle / department lookups.
combined_order_products = pd.concat([order_products_prior, order_products_train], axis=0)
market_basket_all = (
    orders
    .merge(combined_order_products, on='order_id', how='left')
    .merge(products, on='product_id', how='left')
    .merge(aisles, on='aisle_id', how='left')
    .merge(departments, on='department_id', how='left')
)
print(market_basket_all.shape)
market_basket_all.head()
(33894106, 15)
| order_id | user_id | eval_set | order_number | order_dow | order_hour_of_day | days_since_prior_order | product_id | add_to_cart_order | reordered | product_name | aisle_id | department_id | aisle | department | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2539329 | 1 | prior | 1 | 2 | 8 | NaN | 196.0 | 1.0 | 0.0 | Soda | 77.0 | 7.0 | soft drinks | beverages |
| 1 | 2539329 | 1 | prior | 1 | 2 | 8 | NaN | 14084.0 | 2.0 | 0.0 | Organic Unsweetened Vanilla Almond Milk | 91.0 | 16.0 | soy lactosefree | dairy eggs |
| 2 | 2539329 | 1 | prior | 1 | 2 | 8 | NaN | 12427.0 | 3.0 | 0.0 | Original Beef Jerky | 23.0 | 19.0 | popcorn jerky | snacks |
| 3 | 2539329 | 1 | prior | 1 | 2 | 8 | NaN | 26088.0 | 4.0 | 0.0 | Aged White Cheddar Popcorn | 23.0 | 19.0 | popcorn jerky | snacks |
| 4 | 2539329 | 1 | prior | 1 | 2 | 8 | NaN | 26405.0 | 5.0 | 0.0 | XL Pick-A-Size Paper Towel Rolls | 54.0 | 17.0 | paper goods | household |
# Inspect dtypes and memory footprint of the merged frame
market_basket_all.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 33894106 entries, 0 to 33894105 Data columns (total 15 columns): # Column Dtype --- ------ ----- 0 order_id int64 1 user_id int64 2 eval_set object 3 order_number int64 4 order_dow int64 5 order_hour_of_day int64 6 days_since_prior_order float64 7 product_id float64 8 add_to_cart_order float64 9 reordered float64 10 product_name object 11 aisle_id float64 12 department_id float64 13 aisle object 14 department object dtypes: float64(6), int64(5), object(4) memory usage: 4.0+ GB
# Check null values
# Since the prior, train, test data have been merged together, only days_since_prior_order null values will be considered.
# Other features here with 75000 null values mean that they represent the test dataset and need future prediction.
# A NaN days_since_prior_order marks a user's first-ever order.
market_basket_all.isnull().sum()
order_id 0 user_id 0 eval_set 0 order_number 0 order_dow 0 order_hour_of_day 0 days_since_prior_order 2078068 product_id 75000 add_to_cart_order 75000 reordered 75000 product_name 75000 aisle_id 75000 department_id 75000 aisle 75000 department 75000 dtype: int64
# A NaN days_since_prior_order marks a customer's very first order; encode it as -1.
first_order_mask = market_basket_all['days_since_prior_order'].isnull()
market_basket_all.loc[first_order_mask, 'days_since_prior_order'] = -1
market_basket_all.head()
| order_id | user_id | eval_set | order_number | order_dow | order_hour_of_day | days_since_prior_order | product_id | add_to_cart_order | reordered | product_name | aisle_id | department_id | aisle | department | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2539329 | 1 | prior | 1 | 2 | 8 | -1.0 | 196.0 | 1.0 | 0.0 | Soda | 77.0 | 7.0 | soft drinks | beverages |
| 1 | 2539329 | 1 | prior | 1 | 2 | 8 | -1.0 | 14084.0 | 2.0 | 0.0 | Organic Unsweetened Vanilla Almond Milk | 91.0 | 16.0 | soy lactosefree | dairy eggs |
| 2 | 2539329 | 1 | prior | 1 | 2 | 8 | -1.0 | 12427.0 | 3.0 | 0.0 | Original Beef Jerky | 23.0 | 19.0 | popcorn jerky | snacks |
| 3 | 2539329 | 1 | prior | 1 | 2 | 8 | -1.0 | 26088.0 | 4.0 | 0.0 | Aged White Cheddar Popcorn | 23.0 | 19.0 | popcorn jerky | snacks |
| 4 | 2539329 | 1 | prior | 1 | 2 | 8 | -1.0 | 26405.0 | 5.0 | 0.0 | XL Pick-A-Size Paper Towel Rolls | 54.0 | 17.0 | paper goods | household |
# Count fully-duplicated rows (expected to be 0 after the merge)
int(market_basket_all.duplicated().sum())
0
#converting number of day to name
# In this dataset order_dow 0 corresponds to Saturday (TODO confirm against
# the dataset's documentation). A single vectorized .map() replaces the
# original chain of seven .apply() passes, each of which scanned the full
# ~34M-row column once. NOTE: values outside 0-6 would become NaN with .map()
# (the original kept them unchanged); order_dow only takes values 0-6 here.
_dow_names = {
    0: "Saturday", 1: "Sunday", 2: "Monday", 3: "Tuesday",
    4: "Wednesday", 5: "Thursday", 6: "Friday",
}
market_basket_all["order_dow"] = market_basket_all["order_dow"].map(_dow_names)
#Saving result data frame as a csv File
# market_basket_all.to_csv("MarketBasket.csv")
#Reading market basket initial from csv File as a data frame
# market_basket_all = pd.read_csv("MarketBasket.csv")
# Selecting prior dataset from the whole dataset.
# Fix: .copy() makes market_basket an independent frame, so later column
# assignments (e.g. the 'organic' flag added downstream) do not trigger
# SettingWithCopyWarning / ambiguously write into a view of market_basket_all.
market_basket = market_basket_all[market_basket_all['eval_set'] == 'prior'].copy()
# Treemap of the number of distinct products per department/aisle
unique_products = market_basket[['product_name', 'aisle', 'department']].drop_duplicates()
eda7 = (
    unique_products
    .groupby(['aisle', 'department'], as_index=False)['product_name']
    .count()
    .rename(columns={'product_name': 'count'})
)
fig = px.treemap(eda7, path=['department', 'aisle', 'count'], values='count')
fig.update_layout(uniformtext=dict(minsize=30), margin=dict(t=0, l=0, r=0, b=0))
fig.show()
## Department distribution: pie of each department's share of line items
plt.figure(figsize=(15, 8), dpi=100)
dept_counts = market_basket['department'].value_counts()
share_pct = np.array(dept_counts / dept_counts.sum()) * 100
plt.pie(share_pct, labels=np.array(dept_counts.index), autopct='%1.1f%%', startangle=200)
plt.title("Department distribution", fontsize=15)
plt.show()
# Most Popular Departments (Top 10), by line-item count
plt.figure(figsize=(15,8),dpi = 100)
plt.style.use('seaborn-talk')  # NOTE(review): renamed 'seaborn-v0_8-talk' in matplotlib >= 3.6
popular_department = market_basket['department'].value_counts().head(10)
# Fix: pass x/y as keywords — positional data arguments to seaborn plotting
# functions were deprecated in 0.12 and removed in 0.13.
sns.barplot(x=popular_department.index, y=popular_department.values, alpha=0.8)
plt.title("Top 10 Popular Department", weight="bold", c="red",fontsize=15)
plt.xlabel("")
plt.ylabel("Total Order",weight= "bold")
plt.yticks(weight="bold")
plt.xticks(weight="bold")
plt.show()
# Most Popular Aisle (Top 10), by line-item count
plt.figure(figsize=(15,8),dpi=100)
plt.style.use('seaborn-talk')  # NOTE(review): renamed 'seaborn-v0_8-talk' in matplotlib >= 3.6
popular_aisle = market_basket['aisle'].value_counts().head(10)
# Fix: keyword x/y — positional seaborn data args removed in 0.13.
sns.barplot(x=popular_aisle.index, y=popular_aisle.values, alpha=0.8)
plt.xticks(rotation='vertical')
plt.title("Top 10 Popular Aisle", weight="bold", c="red",fontsize=15)
plt.xlabel("")
plt.ylabel("Total Order",weight= "bold")
plt.yticks(weight="bold")
plt.xticks(weight="bold")
plt.show()
# Most Popular Product (Top 10), by line-item count
plt.figure(figsize=(15,8),dpi=100)
plt.style.use('seaborn-talk')  # NOTE(review): renamed 'seaborn-v0_8-talk' in matplotlib >= 3.6
popular_product = market_basket['product_name'].value_counts().head(10)
# Fix: keyword x/y — positional seaborn data args removed in 0.13.
sns.barplot(x=popular_product.index, y=popular_product.values, alpha=0.8)
plt.xticks(rotation='vertical')
plt.title("Top 10 Popular Product", weight="bold", c="red",fontsize=15)
plt.xlabel("")
plt.ylabel("Total Order",weight= "bold")
plt.yticks(weight="bold")
plt.xticks(weight="bold")
plt.show()
# Buy organic or non-organic
plt.figure(figsize=(15,8),dpi=100)
# Flag line items whose product name contains 'Organic' (case-sensitive).
# NOTE(review): if market_basket was created without .copy() this assignment
# triggers SettingWithCopyWarning — create the prior slice with .copy() upstream.
market_basket['organic'] = market_basket.product_name.str.contains('Organic').astype(np.int8)
# One value per order: 1 if the order contains at least one organic product.
org = market_basket.groupby('order_id')['organic'].aggregate("max").value_counts()
# Fix: keyword x/y — positional seaborn data args removed in 0.13.
sns.barplot(x=org.index, y=org / org.sum() * 100)
plt.title("Organic vs Non-Organic", weight="bold", c="red",fontsize=15)
plt.xlabel("")
plt.ylabel("Order Percentage",weight= "bold")
plt.yticks(weight="bold")
plt.xticks(weight="bold")
plt.show()
# Frequency or Order Based on Hours (one row per distinct order)
fig, ax = plt.subplots(figsize=(15,8), dpi = 100)
order_hours = market_basket[['order_id', 'order_hour_of_day']].drop_duplicates()
ax = sns.countplot(x = 'order_hour_of_day', data = order_hours)
plt.title("Order Frequency Based on Hours", weight="bold", c="red",fontsize=15)
plt.ylabel("Number of Order",weight= "bold")
plt.xlabel('Order Time (hours)', fontsize=13, weight="bold")
# Dashed reference line across the whole x-range.
x_coordinates = [-0.5, 23.5]
# Fix: the line is labelled 'Average' but was computed with .median();
# use .mean() so the label is truthful and consistent with the day-of-week
# cell below. Also reuse order_hours instead of recomputing drop_duplicates.
avg = order_hours.groupby(["order_hour_of_day"])['order_id'].count().mean()
y_avg = [avg, avg]
plt.yticks(weight="bold")
plt.xticks(weight="bold")
plt.text(-0.5,150000,'Average',fontsize = 13,backgroundcolor = 'gray',color = 'white')
plt.plot(x_coordinates, y_avg,linestyle = '--', c="gray")
plt.show()
# Frequency of Order Based on Days (one row per distinct order)
dow = market_basket[['order_id', 'order_dow']].drop_duplicates()
plt.figure(figsize=(15, 8), dpi=100)
plt.style.use('seaborn-talk')
weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
ax = sns.countplot(x="order_dow", data=dow, order=weekday_order)
plt.title("Order Frequency Based on Days of Week", weight="bold", c="red", fontsize=15)
plt.xlabel("")
plt.ylabel("Number of Order", weight="bold")
plt.yticks(weight="bold")
plt.xticks(weight="bold")
# Dashed horizontal line at the mean number of orders per weekday.
avg = market_basket[['order_id', 'order_dow']].drop_duplicates().groupby(["order_dow"])['order_id'].count().mean()
plt.text(2.5, 500000, 'Average', fontsize=13, backgroundcolor='gray', color='white')
plt.plot([0, 6], [avg, avg], linestyle='--', c="black")
plt.show()
# Heatmap of Frequency or Order Based on Days & Hours
cats = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
grouped_df = market_basket.groupby(["order_dow", "order_hour_of_day"])["order_number"].aggregate("count").reset_index()
# Fix: DataFrame.pivot dropped positional arguments in pandas 2.0, so the
# original positional call raises TypeError on current pandas.
grouped_df = grouped_df.pivot(index='order_dow', columns='order_hour_of_day', values='order_number').reindex(cats)
plt.figure(figsize=(15,8), dpi = 100)
sns.heatmap(grouped_df,cmap="Oranges")
plt.title("Order Frequency of Day and Hour",weight="bold", c="red",fontsize=15)
plt.ylabel("")
plt.xlabel("Hours",weight="bold")
plt.yticks(weight="bold")
plt.xticks(weight="bold")
plt.show()
# Distribution of days elapsed since the previous order (one row per order)
fig, ax = plt.subplots(dpi=100, figsize=(15, 8))
gap_df = market_basket.loc[
    market_basket['days_since_prior_order'].notnull(),
    ['order_id', 'days_since_prior_order']
].drop_duplicates()
ax = sns.countplot(x='days_since_prior_order', data=gap_df)
ax.set_xlabel('Days since prior order', weight="bold", fontsize=13)
plt.ylabel("Number of Orders", weight="bold", fontsize=13)
plt.yticks(weight="bold")
plt.xticks(weight="bold")
plt.title("Number of days since prior order", weight="bold", c="red", fontsize=15)
plt.show()
# Number of users for each number of total orders.
# A user's last order_number (highest in sequence) is their total order count.
fig, ax = plt.subplots(dpi=100, figsize=(15, 8))
last_orders = (
    market_basket
    .sort_values(['order_number'])
    .drop_duplicates(['user_id'], keep='last')[['user_id', 'order_number']]
)
users_per_count = last_orders.groupby('order_number').count()
sns.lineplot(x='order_number', y='user_id', data=users_per_count, marker='o')
plt.title("Number of orders", weight="bold", c="red", fontsize=15)
plt.ylabel("Number of users", weight="bold")
plt.xlabel("Number of orders", weight="bold")
plt.yticks(weight="bold")
plt.xticks(weight="bold")
plt.show()
# Number of users for each number of total orders, bucketed into intervals
fig, ax = plt.subplots(dpi=100, figsize=(15, 8))
last_order_num = (
    market_basket
    .sort_values(['order_number'])
    .drop_duplicates(['user_id'], keep='last')['order_number']
)
interval_edges = [0, 5, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100]
order_num_cut = pd.cut(last_order_num, bins=interval_edges, right=True).to_frame()
sns.countplot(x='order_number', data=order_num_cut)
plt.title("Number of orders interval", weight="bold", c="red", fontsize=15)
plt.ylabel("Number of users", weight="bold", fontsize=13)
plt.xlabel("Number of orders interval", weight="bold", fontsize=13)
plt.yticks(weight="bold")
plt.xticks(weight="bold")
plt.show()
# Top 10 products which sell most, overlaid with their reorder counts
fig, ax = plt.subplots(figsize=(15, 8), dpi=100)

def _top10_product_counts(df):
    # Count orders per product and keep the 10 largest as (product_name, count).
    counts = df[['order_id', 'product_name']].groupby('product_name').count()
    counts = counts.sort_values('order_id', ascending=False).head(10)
    return counts.rename(columns={'order_id': 'count'}).reset_index()

order = _top10_product_counts(market_basket)
reorder = _top10_product_counts(market_basket[market_basket['reordered'] == 1])
sns.barplot(y='product_name', x='count', data=order, orient='h', color='r', label='total')
sns.barplot(y='product_name', x='count', data=reorder, orient='h', color='b', label='reordered')
plt.title("Reorders compared with total orders From Top 10 Popular Products", weight="bold", c="red", fontsize=15)
plt.ylabel("Product", weight="bold", fontsize=13)
plt.xlabel("Orders Count", weight="bold", fontsize=13)
plt.yticks(weight="bold")
plt.xticks(weight="bold")
plt.legend(loc=4)
plt.show()
# Top 10 aisles which sell most and reorder most
temp_aisle = market_basket.groupby("aisle")["reordered"].agg(['count', 'sum']).rename(columns = {'count':'total','sum':'reorders'})
temp_aisle = temp_aisle.sort_values('total', ascending=False).reset_index()
fig, ax = plt.subplots(figsize = (15,8), dpi = 100)
color = sns.color_palette()
# Fix: y was sliced to 10 rows but x to 20 ([0:20]) — the x/y series passed
# to barplot must have equal lengths; slice both to the same top 10.
sns.barplot(y = temp_aisle.aisle[0:10], x = temp_aisle.total[0:10], color = color[1], label = "total")
sns.barplot(y = temp_aisle.aisle[0:10], x = temp_aisle.reorders[0:10], color = color[4], label = "reordered")
plt.title("Reorders compared with total orders From Top 10 Popular Aisles",weight="bold", c="red",fontsize=15)
plt.ylabel("Aisle", weight="bold", fontsize=13)
plt.xlabel("Orders Count",weight="bold", fontsize=13)
plt.yticks(weight="bold")
plt.xticks(weight="bold")
plt.legend(loc = 4)
plt.show()
# Top 10 departments which sell most and reorder most
temp_dep = market_basket.groupby("department")["reordered"].agg(['count', 'sum']).rename(columns = {'count':'total','sum':'reorders'})
temp_dep = temp_dep.sort_values('total', ascending=False).reset_index()
fig, ax = plt.subplots(figsize = (15,8), dpi = 100)
color = sns.color_palette()
# Fix: y was sliced to 10 rows but x to 20 ([0:20]) — the x/y series passed
# to barplot must have equal lengths; slice both to the same top 10.
sns.barplot(y = temp_dep.department[0:10], x = temp_dep.total[0:10], color = color[0], label = "total")
sns.barplot(y = temp_dep.department[0:10], x = temp_dep.reorders[0:10], color = color[3], label = "reordered")
plt.title("Reorders compared with total orders From Top 10 Popular Departments",weight="bold", c="red",fontsize=15)
plt.ylabel("Department", weight="bold", fontsize=13)
plt.xlabel("Orders Count",weight="bold", fontsize=13)
plt.yticks(weight="bold")
plt.xticks(weight="bold")
plt.legend(loc = 4)
plt.show()
# Reorder percentage of each dept (mean of the 0/1 reordered flag)
grouped_df = market_basket.groupby(["department"])["reordered"].aggregate("mean").reset_index()
plt.figure(figsize=(15,8), dpi = 100)
# Fix: keyword x/y — positional seaborn data args were removed in 0.13.
sns.pointplot(x=grouped_df['department'].values, y=grouped_df['reordered'].values, alpha=0.8)
plt.ylabel('Reorder ratio', fontsize=13, weight="bold")
plt.xlabel('Department', fontsize=13, weight="bold")
plt.title("Reorder ratio by Department", weight="bold", c="red", fontsize=15)
plt.xticks(rotation='vertical')
plt.yticks(weight="bold")
plt.xticks(weight="bold")
plt.show()
# Personal care has lowest reorder ratio and dairy eggs have highest reorder ratio
## Add to cart - reorder ratio: reorder rate as a function of cart position
market_basket2 = market_basket.copy()
market_basket2["add_to_cart_order_mod"] = market_basket2["add_to_cart_order"].copy()
grouped_df = market_basket2.groupby(['add_to_cart_order_mod'])['reordered'].aggregate("mean").reset_index()
plt.figure(figsize=(15,8), dpi = 100)
# Fix: keyword x/y — positional seaborn data args were removed in 0.13.
sns.pointplot(x=grouped_df["add_to_cart_order_mod"].values, y=grouped_df['reordered'].values, alpha=0.8)
plt.xlabel("Add to cart order", fontsize=13, weight="bold")
plt.ylabel("Reorder ratio", fontsize=13, weight="bold")
plt.title("Add to cart order - Reorder ratio", fontsize=15, c="red", weight="bold")
plt.xticks(rotation="vertical")
plt.xlim(0,69)
plt.ylim(0.3,0.7)
plt.show()
# products that are added to the cart initially are more likely to be reordered again compared to the ones added later.
# Reorder ratio across hour of day
# NOTE(review): this cell uses market_basket_all (prior + train + test rows),
# unlike the surrounding cells which use the prior-only market_basket — confirm intended.
grouped_df2 = market_basket_all.groupby(["order_hour_of_day"])["reordered"].aggregate("mean").reset_index()
plt.figure(figsize=(15,8), dpi = 100)
# Fix: keyword x/y — positional seaborn data args were removed in 0.13.
sns.barplot(x=grouped_df2["order_hour_of_day"].values, y=grouped_df2["reordered"].values, alpha=0.8)
plt.xlabel("Hour of day", fontsize=13, weight="bold")
plt.ylabel("Reorder ratio", fontsize=13, weight="bold")
plt.title("Reorder ratio across hour of day", weight="bold", c="red", fontsize=15)
plt.yticks(weight="bold")
plt.xticks(weight="bold")
plt.ylim(0.5,0.7)
plt.show()
# Reorder Ratio of Day and Hour
cats = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
heatmap_reorder = market_basket.groupby(["order_dow", "order_hour_of_day"])["reordered"].aggregate("mean").reset_index()
# Fix: DataFrame.pivot dropped positional arguments in pandas 2.0 — use keywords.
heatmap_reorder = heatmap_reorder.pivot(index='order_dow', columns='order_hour_of_day', values='reordered').reindex(cats)
plt.figure(figsize=(15,8), dpi = 100)
sns.heatmap(heatmap_reorder,cmap="Oranges")
plt.title("Reorder Ratio of Day and Hour", weight="bold", c="red", fontsize=15)
plt.ylabel("")
plt.xlabel("Hours",weight="bold")
plt.yticks(weight="bold")
plt.xticks(weight="bold")
plt.show()
# Segment customers by aisle: crosstab of per-user purchase counts per aisle
# (one row per user, one column per aisle).
cust_aisle = pd.crosstab(index=market_basket['user_id'], columns=market_basket['aisle'])
cust_aisle.head(5)
| aisle | air fresheners candles | asian foods | baby accessories | baby bath body care | baby food formula | bakery desserts | baking ingredients | baking supplies decor | beauty | beers coolers | ... | spreads | tea | tofu meat alternatives | tortillas flat bread | trail mix snack mix | trash bags liners | vitamins supplements | water seltzer sparkling water | white wines | yogurt |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| user_id | |||||||||||||||||||||
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 2 | 0 | 3 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | ... | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 2 | 0 | 42 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 5 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
5 rows × 134 columns
# apply PCA to do the dimension reduction since there are 134 aisle columns
from sklearn.decomposition import PCA
pca = PCA().fit(cust_aisle)  # fit() returns the estimator itself
PCA()
# draw the fraction of explained variance per principal component
fig, ax = plt.subplots(figsize=(10, 6))
ax.set(xlabel='Dimension #',
       ylabel='Explained Variance Ratio',
       title='Fraction of Explained Variance')
ax.plot(pca.explained_variance_ratio_)
plt.show()
# draw the cumulative fraction of explained variance (first 30 components)
fig, ax = plt.subplots(figsize=(10, 6))
ax.set(xlabel='Dimension #',
       ylabel='Cumulative Explained Variance Ratio',
       title='Cumulative Fraction of Explained Variance')
ax.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlim(0, 30)
plt.show()
# when dimension = 12, cumulative explained variance ratio > 0.8
# (index 11 of the cumsum = combined ratio of the first 12 components)
print(np.cumsum(pca.explained_variance_ratio_)[11])
0.8071629834551421
# choose 12 dimensions to apply PCA
# (12 components keep > 80% of the variance per the cumsum check above)
pca = PCA(n_components = 12)
# reduced_cust_aisle: (n_users, 12) array used by the clustering cells below
reduced_cust_aisle = pca.fit_transform(cust_aisle)
# Apply KMeans to cluster our customers: elbow search over k = 1..10
from sklearn.cluster import KMeans
wcss = []
for k in range(1, 11):
    km = KMeans(n_clusters=k, n_init=10, random_state=0)
    km.fit(reduced_cust_aisle)
    wcss.append(km.inertia_)
# Plot inertia against k to locate 'the elbow'
fig, ax = plt.subplots(figsize=(10, 6))
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method - Customer Aisle Crosstab')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')  # within-cluster sum of squares
plt.show()
# Cluster users into 4 groups and scatter-plot the first two PCA components.
kmeans = KMeans(n_clusters = 4, n_init = 10, random_state = 26)
labels = kmeans.fit_predict(reduced_cust_aisle)
cols = ['Component_A', 'Component_B', 'Cluster']
# Get cluster labels and assign plotting colors/labels.
plbls = set(labels)
pclrs = sns.hls_palette(len(plbls))
pcls = ['Class {0}'.format(idx) for idx in plbls]
# Predicted clusters: first two PCA components + cluster id per user.
# Fix: the original referenced the undefined name `reduced_user_aisle`
# (NameError — the PCA output is `reduced_cust_aisle`, L408) and hard-coded
# the row count 206209; reshape(-1, 1) works for any number of users.
pc = pd.DataFrame(
    np.concatenate((reduced_cust_aisle[:, [0, 1]], labels.reshape((-1, 1))), axis=1),
    columns=cols,
)
# Plot
sns.set(font_scale=2.0)
fig, ax = plt.subplots(figsize=(20, 15))
# Plot KMeans clusters
for idx in list(plbls):
    tmp_pdf = pc[pc['Cluster'] == idx]
    ax.scatter(tmp_pdf['Component_A'], tmp_pdf['Component_B'], color=pclrs[idx], label=pcls[idx], alpha=0.8, s = 20)
# Plotting the centroids of the clusters
ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:,1], color = 'black', label = 'Centroid', alpha=1, s=300)
ax.set_xlabel('Component_A')
ax.set_ylabel('Component_B')
ax.set_title('KMeans - User Aisle Crosstab')
ax.legend(bbox_to_anchor=(1, 1), loc=2)
sns.despine(offset=5, trim=True)
sns.set(font_scale=1.0)
# Project the 12-D PCA space to 2-D with t-SNE and color by KMeans cluster.
from sklearn.manifold import TSNE
model = TSNE(learning_rate = 252, perplexity = 12, random_state = 0)
# Fix: the original referenced the undefined name `reduced_user_aisle`
# (NameError) — the PCA-reduced matrix is `reduced_cust_aisle` (L408).
tsne_features = model.fit_transform(reduced_cust_aisle)
xs = tsne_features[:,0]
ys = tsne_features[:,1]
fig, ax = plt.subplots(figsize=(50, 30), dpi = 200)
plt.scatter(xs, ys, alpha = 1, s = 20, c = labels, cmap = 'viridis')
plt.title('TSNE')
plt.show()